Treamid - PDBBind¶
Filter Data¶
Separating the summary_new.csv file into per-step files by 'Recursion Step Done' value — done!
In [1]:
from sklearn.metrics import r2_score
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
# Utility that extracts one recursion step's rows from the summary CSV
def filter_csv(input_file_path, output_file_path, recursion_step='1/5'):
    """Write the subset of rows matching one recursion step to a new CSV.

    Rows are kept only when 'Recursion Step Done' equals ``recursion_step``
    and the physical scores fall in sane ranges (binding affinity, clash
    count and strain energy below 20; confidence score above -10).

    Parameters
    ----------
    input_file_path : str
        Path to the input CSV file.
    output_file_path : str
        Path where the filtered CSV file will be saved.
    recursion_step : str
        The recursion step label to filter by (default '1/5').
    """
    summary = pd.read_csv(input_file_path)
    # Build one boolean mask combining the step match and the score cut-offs
    keep = (
        (summary['Recursion Step Done'] == recursion_step)
        & (summary['Binding Affinity (kcal/mol)'] < 20)
        & (summary['Number of clashes'] < 20)
        & (summary['Strain Energy'] < 20)
        & (summary['Confidence Score'] > -10)
    )
    summary[keep].to_csv(output_file_path, index=False)
    print(f"Filtered data for recursion step {recursion_step} saved to: {output_file_path}")
In [2]:
# Input/output locations for the per-recursion-step split
base_dir = '/fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/'
input_filename = 'summary_new.csv'
output_filename_pattern = 'filtered_summary_new_rec_{}.csv'
input_file_path = f'{base_dir}{input_filename}'
# Produce one filtered CSV per recursion step (1/5 .. 5/5)
for step in range(1, 6):
    recursion_step = f'{step}/5'
    output_file_path = f'{base_dir}{output_filename_pattern.format(step)}'
    filter_csv(input_file_path, output_file_path, recursion_step=recursion_step)
Filtered data for recursion step 1/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_summary_new_rec_1.csv Filtered data for recursion step 2/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_summary_new_rec_2.csv Filtered data for recursion step 3/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_summary_new_rec_3.csv Filtered data for recursion step 4/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_summary_new_rec_4.csv Filtered data for recursion step 5/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_summary_new_rec_5.csv
Read Each Separated File¶
In [3]:
# Sanity-check each per-step CSV: report the row count and peek at the head
for step in range(1, 6):
    file_path = f'{base_dir}{output_filename_pattern.format(step)}'
    data = pd.read_csv(file_path)
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    print(f'First few rows of the dataframe for recursion step {step}/5:')
    display(data.head())
Total number of data points for recursion step 1/5: 12809 First few rows of the dataframe for recursion step 1/5:
| Protein ID | Recursion Step Done | Ligand Description | Binding Affinity (kcal/mol) | Number of clashes | Strain Energy | Confidence Score | Rank of sdf | _timestamp | _runtime | _step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4a7b_protein_processed | 1/5 | NaN | -7.743852 | 7 | 9.233238 | -0.15 | rank1_confidence-0.15 | 1.712196e+09 | 90.840397 | 8 |
| 1 | 4mt9_protein_processed | 1/5 | NaN | -6.984931 | 7 | 9.727376 | -0.42 | rank1_confidence-0.42 | 1.712193e+09 | 73.990098 | 8 |
| 2 | 4msg_protein_processed | 1/5 | NaN | -7.803800 | 14 | 5.416456 | -0.21 | rank1_confidence-0.21 | 1.712192e+09 | 72.087428 | 8 |
| 3 | 4lwu_protein_processed | 1/5 | NaN | -1.004857 | 15 | 5.794271 | -0.96 | rank1_confidence-0.96 | 1.712184e+09 | 53.928325 | 8 |
| 4 | 3l5f_protein_processed | 1/5 | NaN | -5.983912 | 4 | 9.115500 | -0.06 | rank1_confidence-0.06 | 1.712203e+09 | 98.465181 | 8 |
Total number of data points for recursion step 2/5: 6713 First few rows of the dataframe for recursion step 2/5:
| Protein ID | Recursion Step Done | Ligand Description | Binding Affinity (kcal/mol) | Number of clashes | Strain Energy | Confidence Score | Rank of sdf | _timestamp | _runtime | _step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3osw_protein_processed | 2/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.806013 | 5 | 11.128419 | -0.77 | rank1_confidence-0.77 | 1.712178e+09 | 47.703313 | 8 |
| 1 | 2xu5_protein_processed | 2/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -5.188495 | 2 | 7.886749 | 0.53 | rank1_confidence0.53 | 1.712178e+09 | 41.020215 | 8 |
| 2 | 6skb_protein_processed | 2/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.449235 | 7 | 10.282784 | -0.49 | rank1_confidence-0.49 | 1.712190e+09 | 64.057879 | 8 |
| 3 | 5jah_protein_processed | 2/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.638763 | 17 | 12.186867 | -0.11 | rank1_confidence-0.11 | 1.712188e+09 | 58.801625 | 8 |
| 4 | 3oyq_protein_processed | 2/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -4.610049 | 1 | 9.622163 | -0.56 | rank1_confidence-0.56 | 1.712185e+09 | 57.664332 | 8 |
Total number of data points for recursion step 3/5: 4589 First few rows of the dataframe for recursion step 3/5:
| Protein ID | Recursion Step Done | Ligand Description | Binding Affinity (kcal/mol) | Number of clashes | Strain Energy | Confidence Score | Rank of sdf | _timestamp | _runtime | _step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5lz9_protein_processed | 3/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -7.200333 | 10 | 10.013039 | -0.05 | rank1_confidence-0.05 | 1.712193e+09 | 73.473761 | 8 |
| 1 | 3lea_protein_processed | 3/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -5.868697 | 15 | 7.508655 | -0.68 | rank1_confidence-0.68 | 1.712173e+09 | 31.038599 | 8 |
| 2 | 5qbv_protein_processed | 3/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -5.375995 | 5 | 8.855248 | 0.23 | rank1_confidence0.23 | 1.712169e+09 | 19.483277 | 8 |
| 3 | 2c9d_protein_processed | 3/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -7.083409 | 16 | 7.737979 | -0.35 | rank1_confidence-0.35 | 1.712175e+09 | 39.159119 | 8 |
| 4 | 6c28_protein_processed | 3/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -3.430942 | 6 | 3.496771 | -1.25 | rank1_confidence-1.25 | 1.712179e+09 | 43.019742 | 8 |
Total number of data points for recursion step 4/5: 3528 First few rows of the dataframe for recursion step 4/5:
| Protein ID | Recursion Step Done | Ligand Description | Binding Affinity (kcal/mol) | Number of clashes | Strain Energy | Confidence Score | Rank of sdf | _timestamp | _runtime | _step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4bfy_protein_processed | 4/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.706119 | 19 | 9.650411 | -0.49 | rank1_confidence-0.49 | 1.712171e+09 | 26.649156 | 8 |
| 1 | 2v86_protein_processed | 4/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -4.772636 | 8 | 4.767310 | -0.84 | rank1_confidence-0.84 | 1.712194e+09 | 68.381843 | 8 |
| 2 | 3ant_protein_processed | 4/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.526974 | 5 | 5.131837 | -0.69 | rank1_confidence-0.69 | 1.712192e+09 | 70.197422 | 8 |
| 3 | 3f1a_protein_processed | 4/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -2.818593 | 15 | 11.909609 | -0.81 | rank1_confidence-0.81 | 1.712175e+09 | 35.744654 | 8 |
| 4 | 6cj5_protein_processed | 4/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.489578 | 1 | 9.185916 | -0.58 | rank1_confidence-0.58 | 1.712171e+09 | 27.180906 | 8 |
Total number of data points for recursion step 5/5: 2763 First few rows of the dataframe for recursion step 5/5:
| Protein ID | Recursion Step Done | Ligand Description | Binding Affinity (kcal/mol) | Number of clashes | Strain Energy | Confidence Score | Rank of sdf | _timestamp | _runtime | _step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3f18_protein_processed | 5/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.151046 | 3 | 6.017198 | -0.32 | rank1_confidence-0.32 | 1.712175e+09 | 33.212631 | 8 |
| 1 | 1sts_protein_processed | 5/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -5.828997 | 8 | 12.178389 | -0.87 | rank1_confidence-0.87 | 1.712170e+09 | 25.128030 | 8 |
| 2 | 3b8q_protein_processed | 5/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.206666 | 15 | 9.867155 | -0.16 | rank1_confidence-0.16 | 1.712179e+09 | 46.489564 | 8 |
| 3 | 4az2_protein_processed | 5/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -7.062797 | 5 | 5.276325 | 0.35 | rank1_confidence0.35 | 1.712173e+09 | 31.939743 | 8 |
| 4 | 5xjm_protein_processed | 5/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -5.818565 | 7 | 7.974301 | -0.30 | rank1_confidence-0.30 | 1.712181e+09 | 60.047276 | 8 |
Data Distributions¶
In [4]:
# Plot the distribution of each score column for one dataset
def plot_metrics_distributions(data, figure_title):
    """Draw a 2x2 grid of histograms (with KDE) for the four score columns.

    Each panel is annotated with the column mean (dashed magenta line)
    and standard deviation, so the figure stands alone when skimmed.
    """
    sns.set(style="whitegrid", palette="pastel")
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))  # large canvas for legibility
    fig.suptitle(figure_title, fontsize=20, fontweight='bold')
    common_bins = 120  # shared bin count keeps the four panels comparable
    metrics = ['Binding Affinity (kcal/mol)', 'Number of clashes', 'Strain Energy', 'Confidence Score']
    colors = ['olive', 'gold', 'teal', 'purple']
    for ax, metric, color in zip(axes.flat, metrics, colors):
        sns.histplot(data=data, x=metric, kde=True, ax=ax, color=color, bins=common_bins)
        ax.set_title(metric, fontsize=14)
        ax.grid(True)
        # Annotate mean/std directly on the panel
        mean_value = data[metric].mean()
        std_value = data[metric].std()
        ax.axvline(mean_value, color='magenta', linestyle='dashed', linewidth=2)
        ax.text(mean_value + std_value, 0.7, f'Mean: {mean_value:.2f}\nStd: {std_value:.2f}', color='magenta', ha='left')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
# Visualise the metric distributions for every recursion step
for step in range(1, 6):
    data = pd.read_csv(f'{base_dir}{output_filename_pattern.format(step)}')
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    plot_metrics_distributions(data, f'Distribution of Various Metrics for Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 12809
Total number of data points for recursion step 2/5: 6713
Total number of data points for recursion step 3/5: 4589
Total number of data points for recursion step 4/5: 3528
Total number of data points for recursion step 5/5: 2763
Data Distribution Ranges¶
In [5]:
# Plot each metric with a clipped x-range so the bulk of the data is visible
def plot_histograms_with_readable_ranges(data, title_suffix):
    """Draw four stacked histograms, one per metric, with fixed x-limits.

    The vertical layout keeps x-axis tick labels readable; the x-limits
    focus each panel on the value range where most observations live.
    """
    sns.set(style="whitegrid", palette="pastel")
    fig, axes = plt.subplots(4, 1, figsize=(10, 15))
    fig.suptitle(f'Distribution of Selected Metrics with Readable Ranges - {title_suffix}', fontsize=20, fontweight='bold')
    # (metric, axis, x-limits) — 20 bins everywhere
    panels = [
        ('Binding Affinity (kcal/mol)', axes[0], (-20, 20)),
        ('Strain Energy', axes[1], (-20, 20)),
        ('Number of clashes', axes[2], (0, 20)),
        ('Confidence Score', axes[3], (-10, 10)),
    ]
    for metric, ax, xlim in panels:
        sns.histplot(data=data, x=metric, kde=True, ax=ax, bins=20, color='skyblue')
        ax.set_title(metric, fontsize=14)
        ax.set_xlabel(metric, fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        ax.set_xlim(xlim)  # zoom in on the interesting range
        ax.tick_params(axis='x', labelsize=10, rotation=45)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
# Range-limited histograms for every recursion step
for step in range(1, 6):
    data = pd.read_csv(f'{base_dir}{output_filename_pattern.format(step)}')
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    plot_histograms_with_readable_ranges(data, f'Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 12809
Total number of data points for recursion step 2/5: 6713
Total number of data points for recursion step 3/5: 4589
Total number of data points for recursion step 4/5: 3528
Total number of data points for recursion step 5/5: 2763
Correlation Between Confidence Score vs Physical Scores (Binding Affinity, Strain Energy, Number of Clashes)¶
- All data, without restricting the value ranges
In [6]:
def plot_correlations(data, title_suffix):
    """Scatter each physical score against the confidence score (all rows).

    One panel per (x, y) pair; no range filtering is applied here.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(f'Correlations Between Metrics - {title_suffix}', fontsize=20, fontweight='bold')
    # (x column, panel title, x-axis label, marker colour)
    panels = [
        ('Binding Affinity (kcal/mol)', 'Binding Affinity vs Confidence Score', 'Binding Affinity (kcal/mol)', 'skyblue'),
        ('Strain Energy', 'Strain Energy vs Confidence Score', 'Strain Energy', 'lightgreen'),
        ('Number of clashes', 'Number of Clashes vs Confidence Score', 'Number of Clashes', 'salmon'),
    ]
    for ax, (x_col, title, x_label, color) in zip(axes, panels):
        sns.scatterplot(data=data, x=x_col, y='Confidence Score', ax=ax, color=color)
        ax.set_title(title, fontsize=14)
        ax.set_xlabel(x_label, fontsize=12)
        ax.set_ylabel('Confidence Score', fontsize=12)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
# Unfiltered correlation scatter plots for every recursion step
for step in range(1, 6):
    data = pd.read_csv(f'{base_dir}{output_filename_pattern.format(step)}')
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    plot_correlations(data, f'Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 12809
Total number of data points for recursion step 2/5: 6713
Total number of data points for recursion step 3/5: 4589
Total number of data points for recursion step 4/5: 3528
Total number of data points for recursion step 5/5: 2763
Correlation Between Confidence Score vs Physical Scores (Binding Affinity, Strain Energy, Number of Clashes)¶
- All data, with the value ranges restricted
In [7]:
def filter_and_plot_correlations(data, title_suffix):
    """Scatter physical scores vs confidence after clipping to sane ranges.

    Rows outside confidence [-5, 5], binding affinity [-20, 20] or
    strain energy [-20, 20] are dropped before plotting.
    """
    # Keep only rows inside the ranges of interest (between() is inclusive)
    in_range = (
        data['Confidence Score'].between(-5, 5)
        & data['Binding Affinity (kcal/mol)'].between(-20, 20)
        & data['Strain Energy'].between(-20, 20)
    )
    filtered_data = data[in_range]
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(f'Correlations Between Metrics (Adjusted Ranges) - {title_suffix}', fontsize=20, fontweight='bold')
    panels = [
        ('Binding Affinity (kcal/mol)', 'Binding Affinity vs Confidence Score', 'skyblue'),
        ('Strain Energy', 'Strain Energy vs Confidence Score', 'lightgreen'),
        ('Number of clashes', 'Number of Clashes vs Confidence Score', 'salmon'),
    ]
    for ax, (x_col, title, color) in zip(axes, panels):
        sns.scatterplot(data=filtered_data, x=x_col, y='Confidence Score', ax=ax, color=color)
        ax.set_title(title, fontsize=14)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
# Range-filtered correlation scatter plots for every recursion step
for step in range(1, 6):
    data = pd.read_csv(f'{base_dir}{output_filename_pattern.format(step)}')
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    filter_and_plot_correlations(data, f'Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 12809
Total number of data points for recursion step 2/5: 6713
Total number of data points for recursion step 3/5: 4589
Total number of data points for recursion step 4/5: 3528
Total number of data points for recursion step 5/5: 2763
Correlation Between Confidence Score vs Physical Scores (Binding Affinity, Strain Energy, Number of Clashes)¶
- This is all data with selecting range values
- Best-fit lines with $R^2$ values
In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import statsmodels.api as sm
def plot_correlation_with_fit(data, recursion_step):
    """Scatter physical scores vs confidence with robust and linear fits.

    Rows are first clipped to the ranges used throughout this notebook;
    each panel then overlays a Huber robust regression line (red) and a
    least-squares line (blue, dashed), both labelled with their R^2
    against the observed confidence scores.
    """
    # Restrict to the value ranges of interest (between() is inclusive)
    in_range = (
        data['Confidence Score'].between(-5, 5)
        & data['Binding Affinity (kcal/mol)'].between(-20, 20)
        & data['Strain Energy'].between(-20, 20)
    )
    filtered_data = data[in_range]
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(f'Correlations and Linear Fits (Adjusted Ranges) - Recursion Step {recursion_step}', fontsize=20, fontweight='bold')
    metrics_pairs = [
        ('Binding Affinity (kcal/mol)', 'Confidence Score'),
        ('Strain Energy', 'Confidence Score'),
        ('Number of clashes', 'Confidence Score'),
    ]
    colors = ['skyblue', 'lightgreen', 'salmon']
    for ax, (x_metric, y_metric), color in zip(axes, metrics_pairs, colors):
        x_values = filtered_data[x_metric]
        y_values = filtered_data[y_metric]
        sns.scatterplot(data=filtered_data, x=x_metric, y=y_metric, ax=ax, color=color)
        # Huber-loss robust regression (down-weights outliers)
        design = sm.add_constant(x_values)  # add intercept column
        robust_results = sm.RLM(y_values, design, M=sm.robust.norms.HuberT()).fit()
        y_pred_robust = robust_results.predict(design)
        r2_robust = r2_score(y_values, y_pred_robust)
        # Ordinary degree-1 least-squares fit
        poly1d_fn = np.poly1d(np.polyfit(x_values, y_values, 1))
        y_pred_linear = poly1d_fn(x_values)
        r2_linear = r2_score(y_values, y_pred_linear)
        sns.lineplot(x=x_values, y=y_pred_robust, ax=ax, color='red', label=f'Robust R² = {r2_robust:.2f}')
        sns.lineplot(x=x_values, y=y_pred_linear, ax=ax, color='blue', label=f'Linear R² = {r2_linear:.2f}', linestyle='--')
        ax.set_title(f'{x_metric} vs {y_metric}', fontsize=14)
        ax.set_xlabel(x_metric, fontsize=12)
        ax.set_ylabel(y_metric, fontsize=12)
        ax.legend()
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
# Fitted correlation plots for every recursion step
for step in range(1, 6):
    data = pd.read_csv(f'{base_dir}{output_filename_pattern.format(step)}')
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    plot_correlation_with_fit(data, step)
Total number of data points for recursion step 1/5: 12809
Total number of data points for recursion step 2/5: 6713
Total number of data points for recursion step 3/5: 4589
Total number of data points for recursion step 4/5: 3528
Total number of data points for recursion step 5/5: 2763
Comparison of Same Size Recursived Proteins¶
Now we compare the proteins that completed the 5/5 recursion step. First we separate the results by their 'Recursion Step Done' values. Each CSV file must contain the same set of Protein IDs!
In [9]:
def filter_data_for_recursion_step(recursion_step, base_dir, filtered_proteins_file, summary_file_template, output_file_template):
    """Extract one recursion step's rows for the proteins that reached 5/5.

    Reads the list of Protein IDs from ``filtered_proteins_file``, then keeps
    the rows of the summary CSV whose 'Protein ID' is in that list, whose
    'Recursion Step Done' equals ``'{recursion_step}/5'`` and whose binding
    affinity is below 1000, and writes them to
    ``output_file_template.format(recursion_step)``.

    Parameters
    ----------
    recursion_step : int
        Step number (1-5); formatted into '{n}/5' for the column match.
    base_dir : str
        Unused here; kept for signature compatibility with existing callers.
    filtered_proteins_file : str
        CSV whose 'Protein ID' column defines the proteins to keep.
    summary_file_template : str
        Path (or template) of the summary CSV to filter.
    output_file_template : str
        Template for the output CSV path, formatted with ``recursion_step``.
    """
    # Load the file with filtered Protein IDs
    filtered_proteins_df = pd.read_csv(filtered_proteins_file)
    filtered_proteins = filtered_proteins_df['Protein ID']
    # Format the file name for the current recursion step
    summary_file_path = summary_file_template.format(recursion_step)
    # Load the summary data for the current recursion step
    summary_data = pd.read_csv(summary_file_path)
    # NOTE: a previous version filtered a leftover global `data` here; that
    # result was never used and broke on a fresh kernel, so it was removed.
    # Filter the summary data for the current recursion step
    filtered_summary_new = summary_data[summary_data['Protein ID'].isin(filtered_proteins) &
                                        (summary_data['Recursion Step Done'] == f'{recursion_step}/5') &
                                        (summary_data['Binding Affinity (kcal/mol)'] < 1000)
                                        ]
    # Save the filtered data to a new CSV file
    output_file_path = output_file_template.format(recursion_step)
    filtered_summary_new.to_csv(output_file_path, index=False)
    print(f"Filtered data for recursion step {recursion_step}/5 saved to: {output_file_path}")
# Input/output locations for the same-protein-set split
base_dir = '/fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/'
filtered_proteins_file = f'{base_dir}filtered_summary_new_rec_5.csv'
summary_file_template = f'{base_dir}summary_new.csv'
output_file_template = f'{base_dir}filtered_proteins_from_summary_rec{{}}.csv'
# Extract the matching-protein subset for every recursion step
for step in range(1, 6):
    filter_data_for_recursion_step(step, base_dir, filtered_proteins_file, summary_file_template, output_file_template)
Filtered data for recursion step 1/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_proteins_from_summary_rec1.csv Filtered data for recursion step 2/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_proteins_from_summary_rec2.csv Filtered data for recursion step 3/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_proteins_from_summary_rec3.csv Filtered data for recursion step 4/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_proteins_from_summary_rec4.csv Filtered data for recursion step 5/5 saved to: /fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/filtered_proteins_from_summary_rec5.csv
Read Each Separated File (One per Recursion Step)¶
In [10]:
# Sanity-check each same-protein-set CSV: row count plus a peek at the head
for step in range(1, 6):
    data = pd.read_csv(output_file_template.format(step))
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    print(f'First few rows of the dataframe for recursion step {step}/5:')
    display(data.head())
Total number of data points for recursion step 1/5: 2768 First few rows of the dataframe for recursion step 1/5:
| Protein ID | Recursion Step Done | Ligand Description | Binding Affinity (kcal/mol) | Number of clashes | Strain Energy | Confidence Score | Rank of sdf | _timestamp | _runtime | _step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4mt9_protein_processed | 1/5 | NaN | -6.984931 | 7 | 9.727376 | -0.42 | rank1_confidence-0.42 | 1.712193e+09 | 73.990098 | 8 |
| 1 | 3l5f_protein_processed | 1/5 | NaN | -5.983912 | 4 | 9.115500 | -0.06 | rank1_confidence-0.06 | 1.712203e+09 | 98.465181 | 8 |
| 2 | 5f04_protein_processed | 1/5 | NaN | -8.444917 | 6 | 9.143929 | -0.55 | rank1_confidence-0.55 | 1.712182e+09 | 64.459544 | 8 |
| 3 | 5upe_protein_processed | 1/5 | NaN | -7.604168 | 4 | 9.029777 | -0.21 | rank1_confidence-0.21 | 1.712190e+09 | 106.675954 | 8 |
| 4 | 6gu4_protein_processed | 1/5 | NaN | -8.031396 | 12 | 5.412763 | 0.09 | rank1_confidence0.09 | 1.712186e+09 | 71.208169 | 8 |
Total number of data points for recursion step 2/5: 2766 First few rows of the dataframe for recursion step 2/5:
| Protein ID | Recursion Step Done | Ligand Description | Binding Affinity (kcal/mol) | Number of clashes | Strain Energy | Confidence Score | Rank of sdf | _timestamp | _runtime | _step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2xu5_protein_processed | 2/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -5.188495 | 2 | 7.886749 | 0.53 | rank1_confidence0.53 | 1.712178e+09 | 41.020215 | 8 |
| 1 | 3zrl_protein_processed | 2/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.608416 | 8 | 9.125015 | -0.50 | rank1_confidence-0.50 | 1.712181e+09 | 52.921666 | 8 |
| 2 | 1k6c_protein_processed | 2/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.170387 | 11 | 6.698906 | -1.32 | rank1_confidence-1.32 | 1.712170e+09 | 18.585074 | 8 |
| 3 | 3kmm_protein_processed | 2/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.495392 | 9 | 6.488279 | -0.90 | rank1_confidence-0.90 | 1.712179e+09 | 47.001645 | 8 |
| 4 | 2i3i_protein_processed | 2/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -5.587364 | 5 | 8.241349 | -0.13 | rank1_confidence-0.13 | 1.712183e+09 | 49.401926 | 8 |
Total number of data points for recursion step 3/5: 2764 First few rows of the dataframe for recursion step 3/5:
| Protein ID | Recursion Step Done | Ligand Description | Binding Affinity (kcal/mol) | Number of clashes | Strain Energy | Confidence Score | Rank of sdf | _timestamp | _runtime | _step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5lz9_protein_processed | 3/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -7.200333 | 10 | 10.013039 | -0.05 | rank1_confidence-0.05 | 1.712193e+09 | 73.473761 | 8 |
| 1 | 5qbv_protein_processed | 3/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -5.375995 | 5 | 8.855248 | 0.23 | rank1_confidence0.23 | 1.712169e+09 | 19.483277 | 8 |
| 2 | 1hvi_protein_processed | 3/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -5.589729 | 9 | 8.037342 | -1.11 | rank1_confidence-1.11 | 1.712193e+09 | 69.589330 | 8 |
| 3 | 2vj6_protein_processed | 3/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -7.898968 | 10 | 7.427671 | 0.20 | rank1_confidence0.20 | 1.712177e+09 | 43.562624 | 8 |
| 4 | 2nwl_protein_processed | 3/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -2.177144 | 11 | 5.790366 | -1.16 | rank1_confidence-1.16 | 1.712185e+09 | 61.523299 | 8 |
Total number of data points for recursion step 4/5: 2763 First few rows of the dataframe for recursion step 4/5:
| Protein ID | Recursion Step Done | Ligand Description | Binding Affinity (kcal/mol) | Number of clashes | Strain Energy | Confidence Score | Rank of sdf | _timestamp | _runtime | _step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2v86_protein_processed | 4/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -4.772636 | 8 | 4.767310 | -0.84 | rank1_confidence-0.84 | 1.712194e+09 | 68.381843 | 8 |
| 1 | 3ant_protein_processed | 4/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.526974 | 5 | 5.131837 | -0.69 | rank1_confidence-0.69 | 1.712192e+09 | 70.197422 | 8 |
| 2 | 6cj5_protein_processed | 4/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.489578 | 1 | 9.185916 | -0.58 | rank1_confidence-0.58 | 1.712171e+09 | 27.180906 | 8 |
| 3 | 5wae_protein_processed | 4/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -4.399126 | 3 | 7.590899 | -0.56 | rank1_confidence-0.56 | 1.712178e+09 | 46.549621 | 8 |
| 4 | 4erf_protein_processed | 4/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -5.251450 | 8 | 3.243217 | -0.28 | rank1_confidence-0.28 | 1.712197e+09 | 75.663525 | 8 |
Total number of data points for recursion step 5/5: 2763 First few rows of the dataframe for recursion step 5/5:
| Protein ID | Recursion Step Done | Ligand Description | Binding Affinity (kcal/mol) | Number of clashes | Strain Energy | Confidence Score | Rank of sdf | _timestamp | _runtime | _step | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3f18_protein_processed | 5/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.151046 | 3 | 6.017198 | -0.32 | rank1_confidence-0.32 | 1.712175e+09 | 33.212631 | 8 |
| 1 | 1sts_protein_processed | 5/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -5.828997 | 8 | 12.178389 | -0.87 | rank1_confidence-0.87 | 1.712170e+09 | 25.128030 | 8 |
| 2 | 3b8q_protein_processed | 5/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -6.206666 | 15 | 9.867155 | -0.16 | rank1_confidence-0.16 | 1.712179e+09 | 46.489564 | 8 |
| 3 | 4az2_protein_processed | 5/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -7.062797 | 5 | 5.276325 | 0.35 | rank1_confidence0.35 | 1.712173e+09 | 31.939743 | 8 |
| 4 | 5xjm_protein_processed | 5/5 | /p/project/hai_bimsb_dock_drug/Arcas_Stage_1/R... | -5.818565 | 7 | 7.974301 | -0.30 | rank1_confidence-0.30 | 1.712181e+09 | 60.047276 | 8 |
Data Distributions of Each Recursive Step¶
In [11]:
# NOTE(review): this re-defines plot_metrics_distributions from an earlier
# cell with the same behavior — consider reusing the earlier definition.
def plot_metrics_distributions(data, figure_title):
    """Draw a 2x2 grid of histograms (with KDE), one panel per score column.

    Each panel is annotated with the column's mean (dashed magenta line)
    and standard deviation.
    """
    sns.set(style="whitegrid", palette="pastel")
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))  # roomy canvas
    fig.suptitle(figure_title, fontsize=20, fontweight='bold')
    n_bins = 120  # one shared bin count keeps the four panels comparable
    metric_names = ['Binding Affinity (kcal/mol)', 'Number of clashes', 'Strain Energy', 'Confidence Score']
    panel_colors = ['olive', 'gold', 'teal', 'purple']
    for panel_ax, name, shade in zip(axes.flat, metric_names, panel_colors):
        sns.histplot(data=data, x=name, kde=True, ax=panel_ax, color=shade, bins=n_bins)
        panel_ax.set_title(name, fontsize=14)
        panel_ax.grid(True)
        # Annotate mean/std directly on the panel
        mu = data[name].mean()
        sigma = data[name].std()
        panel_ax.axvline(mu, color='magenta', linestyle='dashed', linewidth=2)
        panel_ax.text(mu + sigma, 0.7, f'Mean: {mu:.2f}\nStd: {sigma:.2f}', color='magenta', ha='left')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
# Metric distributions for the same-protein-set files, one step at a time
for step in range(1, 6):
    data = pd.read_csv(output_file_template.format(step))
    print(f'Total number of data points for recursion step {step}/5: {data.shape[0]}')
    plot_metrics_distributions(data, f'Distribution of Various Metrics for Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 2768
Total number of data points for recursion step 2/5: 2766
Total number of data points for recursion step 3/5: 2764
Total number of data points for recursion step 4/5: 2763
Total number of data points for recursion step 5/5: 2763
Data Distribution Ranges of Each Recursive Step¶
In [12]:
# NOTE(review): this re-defines plot_histograms_with_readable_ranges from an
# earlier cell with the same behavior — consider reusing that definition.
def plot_histograms_with_readable_ranges(data, title_suffix):
    """Draw four stacked histograms, one per metric, with fixed x-limits.

    The vertical layout keeps x-axis tick labels readable; the x-limits
    focus each panel on the value range where most observations live.
    """
    sns.set(style="whitegrid", palette="pastel")
    fig, panel_axes = plt.subplots(4, 1, figsize=(10, 15))
    fig.suptitle(f'Distribution of Selected Metrics with Readable Ranges - {title_suffix}', fontsize=20, fontweight='bold')
    # (metric, axis, x-limits) — 20 bins everywhere
    layout = [
        ('Binding Affinity (kcal/mol)', panel_axes[0], (-20, 20)),
        ('Strain Energy', panel_axes[1], (-20, 20)),
        ('Number of clashes', panel_axes[2], (0, 20)),
        ('Confidence Score', panel_axes[3], (-10, 10)),
    ]
    for name, panel_ax, limits in layout:
        sns.histplot(data=data, x=name, kde=True, ax=panel_ax, bins=20, color='skyblue')
        panel_ax.set_title(name, fontsize=14)
        panel_ax.set_xlabel(name, fontsize=12)
        panel_ax.set_ylabel('Frequency', fontsize=12)
        panel_ax.set_xlim(limits)  # zoom in on the interesting range
        panel_ax.tick_params(axis='x', labelsize=10, rotation=45)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
# Draw the fixed-range histograms for every recursion step.
for step in range(1, 6):
    csv_path = output_file_template.format(step)
    step_data = pd.read_csv(csv_path)
    print(f'Total number of data points for recursion step {step}/5: {step_data.shape[0]}')
    plot_histograms_with_readable_ranges(step_data, f'Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 2768
Total number of data points for recursion step 2/5: 2766
Total number of data points for recursion step 3/5: 2764
Total number of data points for recursion step 4/5: 2763
Total number of data points for recursion step 5/5: 2763
Correlation Between Confidence Score vs Physical Scores (Binding Affinity, Strain Energy, Number of Clashes) of Each Recursive Step¶
- This uses all of the data, without restricting value ranges
In [13]:
def plot_correlations(data, title_suffix):
    """Scatter each physical score against the confidence score, side by side.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the metric columns referenced below.
    title_suffix : str
        Appended to the figure title.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(f'Correlations Between Metrics - {title_suffix}', fontsize=20, fontweight='bold')
    # (x column, panel title, x-axis label, point colour) for each subplot.
    panels = [
        ('Binding Affinity (kcal/mol)', 'Binding Affinity vs Confidence Score', 'Binding Affinity (kcal/mol)', 'skyblue'),
        ('Strain Energy', 'Strain Energy vs Confidence Score', 'Strain Energy', 'lightgreen'),
        ('Number of clashes', 'Number of Clashes vs Confidence Score', 'Number of Clashes', 'salmon'),
    ]
    for ax, (x_col, panel_title, x_label, colour) in zip(axes, panels):
        sns.scatterplot(data=data, x=x_col, y='Confidence Score', ax=ax, color=colour)
        ax.set_title(panel_title, fontsize=14)
        ax.set_xlabel(x_label, fontsize=12)
        ax.set_ylabel('Confidence Score', fontsize=12)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
# Draw the correlation scatter plots for every recursion step.
for step in range(1, 6):
    csv_path = output_file_template.format(step)
    step_data = pd.read_csv(csv_path)
    print(f'Total number of data points for recursion step {step}/5: {step_data.shape[0]}')
    plot_correlations(step_data, f'Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 2768
Total number of data points for recursion step 2/5: 2766
Total number of data points for recursion step 3/5: 2764
Total number of data points for recursion step 4/5: 2763
Total number of data points for recursion step 5/5: 2763
Correlation Between Confidence Score vs Physical Scores (Binding Affinity, Strain Energy, Number of Clashes) of Each Recursive Step¶
- This uses only the data within the selected value ranges
In [14]:
def filter_and_plot_correlations(data, title_suffix):
    """Restrict each metric to a plausible range, then scatter the physical
    scores against the confidence score (one panel per score).

    Rows are kept only when ALL three range conditions hold (inclusive bounds).
    """
    in_range = (
        data['Confidence Score'].between(-5, 5)
        & data['Binding Affinity (kcal/mol)'].between(-20, 20)
        & data['Strain Energy'].between(-20, 20)
    )
    filtered_data = data[in_range]
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(f'Correlations Between Metrics (Adjusted Ranges) - {title_suffix}', fontsize=20, fontweight='bold')
    # (x column, panel title, point colour) for each subplot.
    panels = [
        ('Binding Affinity (kcal/mol)', 'Binding Affinity vs Confidence Score', 'skyblue'),
        ('Strain Energy', 'Strain Energy vs Confidence Score', 'lightgreen'),
        ('Number of clashes', 'Number of Clashes vs Confidence Score', 'salmon'),
    ]
    for ax, (x_col, panel_title, colour) in zip(axes, panels):
        sns.scatterplot(data=filtered_data, x=x_col, y='Confidence Score', ax=ax, color=colour)
        ax.set_title(panel_title, fontsize=14)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
# Apply the range filter and draw the correlation plots for each step.
for step in range(1, 6):
    csv_path = output_file_template.format(step)
    step_data = pd.read_csv(csv_path)
    print(f'Total number of data points for recursion step {step}/5: {step_data.shape[0]}')
    filter_and_plot_correlations(step_data, f'Recursion Step {step}/5')
Total number of data points for recursion step 1/5: 2768
Total number of data points for recursion step 2/5: 2766
Total number of data points for recursion step 3/5: 2764
Total number of data points for recursion step 4/5: 2763
Total number of data points for recursion step 5/5: 2763
Correlation Between Confidence Score vs Physical Scores (Binding Affinity, Strain Energy, Number of Clashes) of Each Recursive Step¶
- This uses only the data within the selected value ranges
- Best-fit lines shown with their R² values
In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import statsmodels.api as sm
def plot_correlation_with_fit(data, recursion_step):
# Scatter each physical score against the confidence score, restricted to
# plausible value ranges, and overlay two straight-line fits per panel:
# a Huber-loss robust regression (statsmodels RLM) and an ordinary
# least-squares fit (numpy polyfit), each labelled with its R² against the
# observed confidence scores.
# Filter data based on the specified ranges (all bounds inclusive)
filtered_data = data[
(data['Confidence Score'] >= -5) & (data['Confidence Score'] <= 5) &
(data['Binding Affinity (kcal/mol)'] >= -20) & (data['Binding Affinity (kcal/mol)'] <= 20) &
(data['Strain Energy'] >= -20) & (data['Strain Energy'] <= 20)
]
# Preparing the figure
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle(f'Correlations and Linear Fits (Adjusted Ranges) - Recursion Step {recursion_step}', fontsize=20, fontweight='bold')
# Defining metrics pairs for correlation
metrics_pairs = [
('Binding Affinity (kcal/mol)', 'Confidence Score'),
('Strain Energy', 'Confidence Score'),
('Number of clashes', 'Confidence Score')
]
colors = ['skyblue', 'lightgreen', 'salmon']
for i, (x_metric, y_metric) in enumerate(metrics_pairs):
# Scatter plot
sns.scatterplot(data=filtered_data, x=x_metric, y=y_metric, ax=axes[i], color=colors[i])
# Robust linear model: Huber loss down-weights outlying points
X = sm.add_constant(filtered_data[x_metric]) # Adding a constant for the intercept
robust_model = sm.RLM(filtered_data[y_metric], X, M=sm.robust.norms.HuberT())
robust_results = robust_model.fit()
# Calculate R^2 score for robust model
# NOTE(review): R² is an OLS notion; for a robust fit it can be lower by
# construction, so read it as descriptive only.
y_pred_robust = robust_results.predict(X)
r2_robust = r2_score(filtered_data[y_metric], y_pred_robust)
# Linear model: ordinary least squares via a degree-1 polynomial fit
coef = np.polyfit(filtered_data[x_metric], filtered_data[y_metric], 1)
poly1d_fn = np.poly1d(coef)
# Calculate R^2 score for linear model
y_pred_linear = poly1d_fn(filtered_data[x_metric])
r2_linear = r2_score(filtered_data[y_metric], y_pred_linear)
# Plot robust fit (solid red) and linear fit (dashed blue) with R² legends
sns.lineplot(x=filtered_data[x_metric], y=y_pred_robust, ax=axes[i], color='red', label=f'Robust R² = {r2_robust:.2f}')
sns.lineplot(x=filtered_data[x_metric], y=poly1d_fn(filtered_data[x_metric]), ax=axes[i], color='blue', label=f'Linear R² = {r2_linear:.2f}', linestyle='--')
axes[i].set_title(f'{x_metric} vs {y_metric}', fontsize=14)
axes[i].set_xlabel(x_metric, fontsize=12)
axes[i].set_ylabel(y_metric, fontsize=12)
axes[i].legend()
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
# Fit and plot the correlations for every recursion step.
for step in range(1, 6):
    csv_path = output_file_template.format(step)
    step_data = pd.read_csv(csv_path)
    print(f'Total number of data points for recursion step {step}/5: {step_data.shape[0]}')
    plot_correlation_with_fit(step_data, step)
Total number of data points for recursion step 1/5: 2768
Total number of data points for recursion step 2/5: 2766
Total number of data points for recursion step 3/5: 2764
Total number of data points for recursion step 4/5: 2763
Total number of data points for recursion step 5/5: 2763
Box Plots (Binding Affinity, Strain Energy, Number of Clashes) of Each Recursive Step¶
In [16]:
# Drop rows whose value in one column lies far outside the bulk of the data.
def remove_outliers(df, column_name):
    """Filter `df` to rows where `column_name` lies within 1.5 spreads of the
    10th-90th percentile range.

    Note: although Tukey-style, the fences are built from the 10th and 90th
    percentiles (not the usual quartiles), so this is more permissive than
    the classic IQR rule. Bounds are inclusive.
    """
    values = df[column_name]
    p10, p90 = values.quantile(0.1), values.quantile(0.9)
    spread = p90 - p10
    low, high = p10 - 1.5 * spread, p90 + 1.5 * spread
    return df[values.between(low, high)]
# Box plot of one metric per recursion step, outliers removed beforehand.
def plot_fancy_boxplot(data, column_name, recursion_steps, color):
    """Draw a styled box plot of `column_name` grouped by 'Recursion Step'.

    `recursion_steps` is accepted for interface compatibility but not used:
    the groups are taken from the 'Recursion Step' column itself.
    """
    trimmed = remove_outliers(data, column_name)
    plt.figure(figsize=(12, 8))
    sns.boxplot(x='Recursion Step', y=column_name, data=trimmed, fliersize=4, linewidth=2.5, color=color)
    # Overlay the individual points for a sense of density.
    sns.stripplot(x='Recursion Step', y=column_name, data=trimmed, jitter=True, size=4, color='gray', alpha=0.5)
    plt.title(f'{column_name} across Recursion Steps (Without Outliers)', fontsize=18)
    plt.xlabel('Recursion Step', fontsize=14)
    plt.ylabel(column_name, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    sns.despine(offset=10, trim=True)
    plt.tight_layout()
    plt.show()
# Load each recursion step's CSV, tag its rows with the step label, and combine.
base_dir = '/fast/AG_Akalin/asarigun/Arcas_Stage_1/ROOF/COMPASS/experiments/PDBBind/data/'
output_file_template = 'filtered_proteins_from_summary_rec{}.csv'
# Collect the per-step frames first and concatenate ONCE: calling pd.concat
# inside the loop re-copies the accumulated data every iteration (quadratic).
# ignore_index gives the combined frame a clean unique index instead of the
# duplicated per-file indices the loop version produced.
step_frames = []
for step in range(1, 6):
    step_df = pd.read_csv(base_dir + output_file_template.format(step))
    step_df['Recursion Step'] = f'{step}/5'
    step_frames.append(step_df)
all_data = pd.concat(step_frames, ignore_index=True)
# Metrics to visualise and the colour used for each.
columns_to_plot = [
    'Binding Affinity (kcal/mol)',
    'Confidence Score',
    'Strain Energy',
    'Number of clashes'
]
colors = {
    'Binding Affinity (kcal/mol)': 'skyblue',
    'Confidence Score': 'lightgreen',
    'Strain Energy': 'lightcoral',
    'Number of clashes': 'violet'
}
# One box plot per metric, grouped by recursion step.
for column in columns_to_plot:
    plot_fancy_boxplot(all_data, column, range(1, 6), colors[column])
Violin Plots (Binding Affinity, Strain Energy, Number of Clashes) of Each Recursive Step¶
In [17]:
# Violin plot of one metric per recursion step, outliers removed beforehand.
def plot_fancy_violin_plot(data, column_name, recursion_steps, color):
    """Draw a styled violin plot of `column_name` grouped by 'Recursion Step'.

    `recursion_steps` is accepted for interface compatibility but not used:
    the groups come from the 'Recursion Step' column itself.
    """
    trimmed = remove_outliers(data, column_name)
    plt.figure(figsize=(12, 8))
    sns.violinplot(x='Recursion Step', y=column_name, data=trimmed, inner='quartile', linewidth=2.5, color=color)
    # Overlay the individual points for a sense of density.
    sns.stripplot(x='Recursion Step', y=column_name, data=trimmed, jitter=True, size=4, color='grey', alpha=0.6)
    plt.title(f'{column_name} across Recursion Steps (Without Outliers)', fontsize=18)
    plt.xlabel('Recursion Step', fontsize=14)
    plt.ylabel(column_name, fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    sns.despine(offset=10, trim=True)
    plt.tight_layout()
    plt.show()
# Assuming 'base_dir' and 'output_file_template' have been defined and the data has been loaded.
# Assuming 'all_data' DataFrame is ready and contains 'Recursion Step' column
# NOTE(review): columns_to_plot and colors below duplicate the definitions in
# the box-plot cell above with identical values; consider defining them once.
# List of columns to create violin plots for
columns_to_plot = [
'Binding Affinity (kcal/mol)',
'Confidence Score',
'Strain Energy',
'Number of clashes'
]
colors = {
'Binding Affinity (kcal/mol)': 'skyblue',
'Confidence Score': 'lightgreen',
'Strain Energy': 'lightcoral',
'Number of clashes': 'violet'
}
# Plot fancy violin plots for all specified columns with colors
for column in columns_to_plot:
plot_fancy_violin_plot(all_data, column, range(1, 6), colors[column])
Comparative Analysis (t-tests, ANOVA, Kruskal-Wallis test, Tukey HSD test)¶
In [18]:
import scipy.stats as stats
# Welch t-tests between every pair of recursion steps for one metric column.
def perform_t_tests(data, column_name):
    """Print a Welch (unequal-variance) t-test for each pair of recursion steps."""
    steps = list(data['Recursion Step'].unique())
    for i, first in enumerate(steps):
        for second in steps[i + 1:]:
            sample_a = data.loc[data['Recursion Step'] == first, column_name]
            sample_b = data.loc[data['Recursion Step'] == second, column_name]
            # equal_var=False selects Welch's t-test (no equal-variance assumption)
            t_stat, p_val = stats.ttest_ind(sample_a, sample_b, equal_var=False)
            print(f"T-test between {first} and {second} for {column_name}:")
            print(f"T-statistic: {t_stat}, P-value: {p_val}")
            print("\n")
# One-way ANOVA across all recursion steps for one metric column.
def perform_anova(data, column_name):
    """Print a one-way ANOVA F-test across the recursion-step groups."""
    step_labels = data['Recursion Step'].unique()
    samples = [data.loc[data['Recursion Step'] == label, column_name] for label in step_labels]
    f_stat, p_val = stats.f_oneway(*samples)
    print(f"ANOVA for {column_name}:")
    print(f"F-statistic: {f_stat}, P-value: {p_val}")
    print("\n")
# Kruskal-Wallis H-test (non-parametric ANOVA) across all recursion steps.
def perform_kruskal_wallis(data, column_name):
    """Print a Kruskal-Wallis H-test across the recursion-step groups."""
    step_labels = data['Recursion Step'].unique()
    samples = [data.loc[data['Recursion Step'] == label, column_name] for label in step_labels]
    h_stat, p_val = stats.kruskal(*samples)
    print(f"Kruskal-Wallis test for {column_name}:")
    print(f"H-statistic: {h_stat}, P-value: {p_val}")
    print("\n")
# Run every statistical comparison on each metric of the combined data.
# (Requires the 'all_data' DataFrame with its 'Recursion Step' column.)
column_to_analyze = [
    'Binding Affinity (kcal/mol)',
    'Confidence Score',
    'Strain Energy',
    'Number of clashes'
]
for column in column_to_analyze:
    # Pairwise t-tests, then ANOVA, then the non-parametric Kruskal-Wallis.
    for comparison in (perform_t_tests, perform_anova, perform_kruskal_wallis):
        comparison(all_data, column)
T-test between 1/5 and 2/5 for Binding Affinity (kcal/mol): T-statistic: 0.36898556049411196, P-value: 0.7121537928083277 T-test between 1/5 and 3/5 for Binding Affinity (kcal/mol): T-statistic: 0.2329064062917231, P-value: 0.8158434981121949 T-test between 1/5 and 4/5 for Binding Affinity (kcal/mol): T-statistic: 0.4531290450527204, P-value: 0.650475128591714 T-test between 1/5 and 5/5 for Binding Affinity (kcal/mol): T-statistic: -1.5127641352192291, P-value: 0.13039850905571027 T-test between 2/5 and 3/5 for Binding Affinity (kcal/mol): T-statistic: -0.16430827239685758, P-value: 0.8694944783543672 T-test between 2/5 and 4/5 for Binding Affinity (kcal/mol): T-statistic: 0.09925940992664334, P-value: 0.9209359051841861 T-test between 2/5 and 5/5 for Binding Affinity (kcal/mol): T-statistic: -2.175000921544778, P-value: 0.029672907378204653 T-test between 3/5 and 4/5 for Binding Affinity (kcal/mol): T-statistic: 0.26470682645715254, P-value: 0.7912451937911257 T-test between 3/5 and 5/5 for Binding Affinity (kcal/mol): T-statistic: -2.0360095269038796, P-value: 0.04179786691038868 T-test between 4/5 and 5/5 for Binding Affinity (kcal/mol): T-statistic: -2.2721968114625497, P-value: 0.023113476650745327 ANOVA for Binding Affinity (kcal/mol): F-statistic: 1.6557797274380939, P-value: 0.1572631946602122 Kruskal-Wallis test for Binding Affinity (kcal/mol): H-statistic: 3.2204392362837098, P-value: 0.5216359983730945 T-test between 1/5 and 2/5 for Confidence Score: T-statistic: -0.15066092884768245, P-value: 0.8802486885982177 T-test between 1/5 and 3/5 for Confidence Score: T-statistic: 0.0974582340208248, P-value: 0.9223660326918778 T-test between 1/5 and 4/5 for Confidence Score: T-statistic: -1.2021169715379008, P-value: 0.22937295535424126 T-test between 1/5 and 5/5 for Confidence Score: T-statistic: 0.45502112776988995, P-value: 0.6491119113901627 T-test between 2/5 and 3/5 for Confidence Score: T-statistic: 0.2486837271791837, P-value: 0.8036146754581329 T-test 
between 2/5 and 4/5 for Confidence Score: T-statistic: -1.0728082156606968, P-value: 0.2834070226204205 T-test between 2/5 and 5/5 for Confidence Score: T-statistic: 0.6048050616210732, P-value: 0.5453333696863079 T-test between 3/5 and 4/5 for Confidence Score: T-statistic: -1.2890862098611735, P-value: 0.19742563656462017 T-test between 3/5 and 5/5 for Confidence Score: T-statistic: 0.3598016692247383, P-value: 0.7190092149271758 T-test between 4/5 and 5/5 for Confidence Score: T-statistic: 1.5884891666047898, P-value: 0.11223611580529022 ANOVA for Confidence Score: F-statistic: 0.86888165238881, P-value: 0.4816376788466181 Kruskal-Wallis test for Confidence Score: H-statistic: 1.2483887886284888, P-value: 0.8700692621758888 T-test between 1/5 and 2/5 for Strain Energy: T-statistic: -0.7319547567507239, P-value: 0.46422721577452497 T-test between 1/5 and 3/5 for Strain Energy: T-statistic: -0.5132622222426472, P-value: 0.6077884132139629 T-test between 1/5 and 4/5 for Strain Energy: T-statistic: -0.4822940091033011, P-value: 0.6296161947652388 T-test between 1/5 and 5/5 for Strain Energy: T-statistic: -0.48009629133263887, P-value: 0.6311781615396754 T-test between 2/5 and 3/5 for Strain Energy: T-statistic: 0.21623722388874564, P-value: 0.8288108200331613 T-test between 2/5 and 4/5 for Strain Energy: T-statistic: 0.2446543598456842, P-value: 0.8067331571943515 T-test between 2/5 and 5/5 for Strain Energy: T-statistic: 0.20210351292805548, P-value: 0.8398433899859543 T-test between 3/5 and 4/5 for Strain Energy: T-statistic: 0.029091411838153763, P-value: 0.976792736053002 T-test between 3/5 and 5/5 for Strain Energy: T-statistic: 0.0001511430182507797, P-value: 0.9998794108632965 T-test between 4/5 and 5/5 for Strain Energy: T-statistic: -0.0270598854097343, P-value: 0.9784129609086499 ANOVA for Strain Energy: F-statistic: 0.13782524944989935, P-value: 0.968315439830812 Kruskal-Wallis test for Strain Energy: H-statistic: 1.9636033513015718, P-value: 
0.7424532988249419 T-test between 1/5 and 2/5 for Number of clashes: T-statistic: -0.39527331902194923, P-value: 0.6926564286386435 T-test between 1/5 and 3/5 for Number of clashes: T-statistic: 1.1305238829657684, P-value: 0.25830457195463846 T-test between 1/5 and 4/5 for Number of clashes: T-statistic: -0.8255911580017684, P-value: 0.40907167376338827 T-test between 1/5 and 5/5 for Number of clashes: T-statistic: -8.307516039441273, P-value: 1.2387584781813465e-16 T-test between 2/5 and 3/5 for Number of clashes: T-statistic: 1.5267133888223372, P-value: 0.12688951090686226 T-test between 2/5 and 4/5 for Number of clashes: T-statistic: -0.42865476661795643, P-value: 0.6681911890216506 T-test between 2/5 and 5/5 for Number of clashes: T-statistic: -7.969065339604163, P-value: 1.9553513590135814e-15 T-test between 3/5 and 4/5 for Number of clashes: T-statistic: -1.9628420446240016, P-value: 0.049714630879825625 T-test between 3/5 and 5/5 for Number of clashes: T-statistic: -9.288209390147173, P-value: 2.2696235084951756e-20 T-test between 4/5 and 5/5 for Number of clashes: T-statistic: -7.626390730307421, P-value: 2.861759313341065e-14 ANOVA for Number of clashes: F-statistic: 34.10450911333242, P-value: 2.2825366164770107e-28 Kruskal-Wallis test for Number of clashes: H-statistic: 44.855516114925344, P-value: 4.260682422770966e-09
In [19]:
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import pandas as pd
# Assuming 'all_data' is your DataFrame and it includes 'Recursion Step' and the metric of interest
def tukey_hsd_posthoc_test(data, column_name):
# Post-hoc pairwise comparison of recursion-step means for `column_name`
# using Tukey's HSD (honestly significant difference) at the 5% level.
# Perform the Tukey HSD test
tukey_result = pairwise_tukeyhsd(endog=data[column_name],
groups=data['Recursion Step'],
alpha=0.05) # alpha is the significance level
# Print the summary of the test results
print(tukey_result)
# Plot the results
# plot_simultaneous shows the group means with their comparison intervals.
tukey_result.plot_simultaneous()
plt.title(f'Tukey HSD test for {column_name}')
plt.show()
# Load your dataset
# Ensure 'all_data' DataFrame is prepared with 'Recursion Step' and the metrics columns
# Define a function to annotate statistical significance on plots
def annotate_significance(ax, x1, x2, max_y, text):
# Draw a bracket between x-positions x1 and x2 just above max_y and write
# `text` (e.g. significance stars) centred on it.
# NOTE(review): the fixed +1 bracket height assumes a y-axis spanning several
# units; on tightly scaled axes the bracket may look oversized — confirm.
# NOTE(review): this helper is defined but never called in this section.
y = max_y + max_y * 0.02 # Slightly above the highest data point
ax.plot([x1, x1, x2, x2], [y, y+1, y+1, y], lw=1.5, c='black')
ax.text((x1+x2)*.5, y+1, text, ha='center', va='bottom', color='black')
# Function to visualize and statistically compare groups
def visualize_and_compare(data, column_name):
    """Visualise `column_name` across recursion steps and test for group differences.

    Draws a box plot and a violin plot of the outlier-trimmed data, then runs a
    one-way ANOVA on the (untrimmed) groups; if significant at the 5% level,
    follows up with a Tukey HSD post-hoc test on the trimmed data.
    """
    cleaned_data = remove_outliers(data, column_name)
    # Box plot. Assigning `hue` to the x variable (with the legend suppressed)
    # replaces the palette-without-hue usage that seaborn 0.13+ flags with a
    # FutureWarning; the rendered figure is unchanged.
    plt.figure(figsize=(12, 8))
    ax = sns.boxplot(x='Recursion Step', y=column_name, data=cleaned_data,
                     hue='Recursion Step', palette="Set3", legend=False)
    sns.stripplot(x='Recursion Step', y=column_name, data=cleaned_data, color='grey', alpha=0.6, jitter=True)
    plt.title(f'{column_name} across Recursion Steps')
    plt.show()
    # Violin plot of the same trimmed groups.
    plt.figure(figsize=(12, 8))
    sns.violinplot(x='Recursion Step', y=column_name, data=cleaned_data, inner='quartile',
                   hue='Recursion Step', palette="Set2", legend=False)
    plt.title(f'Distribution of {column_name} across Recursion Steps')
    plt.show()
    # ANOVA on the full (untrimmed) data; note the post-hoc test below runs on
    # the trimmed data, so the two stages see slightly different samples.
    groups = [group[column_name].dropna() for name, group in data.groupby('Recursion Step')]
    if all(group.size > 1 for group in groups):  # f_oneway needs >1 observation per group
        f_val, p_val = stats.f_oneway(*groups)
        print(f"ANOVA for {column_name}: F-value = {f_val}, P-value = {p_val}")
        if p_val < 0.05:
            print("Significant differences found. Consider conducting post-hoc tests for pairwise comparisons.")
            print("Conducting post-hoc test...")
            tukey_hsd_posthoc_test(cleaned_data, column_name)
        else:
            print("No significant differences found across Recursion Steps.")
    else:
        print("Not all groups have sufficient data for ANOVA.")
# Visual + statistical comparison for each metric of interest.
column_to_analyze = [
    'Binding Affinity (kcal/mol)',
    'Confidence Score',
    'Strain Energy',
    'Number of clashes'
]
for metric in column_to_analyze:
    visualize_and_compare(all_data, metric)
# Note: This is a basic implementation. For post-hoc tests, consider using Tukey's HSD or pairwise t-tests with Bonferroni correction.
/tmp/ipykernel_139294/498102362.py:40: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax = sns.boxplot(x='Recursion Step', y=column_name, data=cleaned_data, palette="Set3")
/tmp/ipykernel_139294/498102362.py:47: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.violinplot(x='Recursion Step', y=column_name, data=cleaned_data, inner='quartile', palette="Set2")
ANOVA for Binding Affinity (kcal/mol): F-value = 1.6557797274380939, P-value = 0.1572631946602122 No significant differences found across Recursion Steps.
/tmp/ipykernel_139294/498102362.py:40: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax = sns.boxplot(x='Recursion Step', y=column_name, data=cleaned_data, palette="Set3")
/tmp/ipykernel_139294/498102362.py:47: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.violinplot(x='Recursion Step', y=column_name, data=cleaned_data, inner='quartile', palette="Set2")
ANOVA for Confidence Score: F-value = 0.86888165238881, P-value = 0.4816376788466181 No significant differences found across Recursion Steps.
/tmp/ipykernel_139294/498102362.py:40: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax = sns.boxplot(x='Recursion Step', y=column_name, data=cleaned_data, palette="Set3")
/tmp/ipykernel_139294/498102362.py:47: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.violinplot(x='Recursion Step', y=column_name, data=cleaned_data, inner='quartile', palette="Set2")
ANOVA for Strain Energy: F-value = 0.13782524944989935, P-value = 0.968315439830812 No significant differences found across Recursion Steps.
/tmp/ipykernel_139294/498102362.py:40: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax = sns.boxplot(x='Recursion Step', y=column_name, data=cleaned_data, palette="Set3")
/tmp/ipykernel_139294/498102362.py:47: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.violinplot(x='Recursion Step', y=column_name, data=cleaned_data, inner='quartile', palette="Set2")
ANOVA for Number of clashes: F-value = 34.10450911333242, P-value = 2.2825366164770107e-28 Significant differences found. Consider conducting post-hoc tests for pairwise comparisons. Conducting post-hoc test... Multiple Comparison of Means - Tukey HSD, FWER=0.05 =================================================== group1 group2 meandiff p-adj lower upper reject --------------------------------------------------- 1/5 2/5 0.046 0.9859 -0.1976 0.2896 False 1/5 3/5 -0.0812 0.8935 -0.3248 0.1624 False 1/5 4/5 0.0817 0.8912 -0.1619 0.3254 False 1/5 5/5 0.832 0.0 0.5884 1.0756 True 2/5 3/5 -0.1272 0.6121 -0.3708 0.1165 False 2/5 4/5 0.0357 0.9946 -0.2079 0.2794 False 2/5 5/5 0.786 0.0 0.5424 1.0297 True 3/5 4/5 0.1629 0.3598 -0.0808 0.4066 False 3/5 5/5 0.9132 0.0 0.6695 1.1569 True 4/5 5/5 0.7503 0.0 0.5065 0.994 True ---------------------------------------------------
PCA¶
In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Assuming 'all_data' is your DataFrame
# Outlier removal function
# NOTE(review): duplicate of the remove_outliers defined earlier in this
# notebook (identical body); this later definition silently shadows the first.
def remove_outliers(df, column_name):
# Despite the Q1/Q3 names, the fences are built from the 10th and 90th
# percentiles rather than the usual quartiles, so this filter is more
# permissive than the classic Tukey/IQR rule. Bounds are inclusive.
Q1 = df[column_name].quantile(0.1)
Q3 = df[column_name].quantile(0.9)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
return df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)]
# List of features to include in PCA
features = ['Binding Affinity (kcal/mol)', 'Confidence Score', 'Strain Energy', 'Number of clashes']
# Trim outliers feature by feature. Starting from all_data removes the
# original empty-DataFrame special case on the first loop iteration.
cleaned_data = all_data
for feature in features:
    cleaned_data = remove_outliers(cleaned_data, feature)
# Standardise to zero mean / unit variance so no feature dominates the PCA.
x = StandardScaler().fit_transform(cleaned_data.loc[:, features].values)
# Project onto the first two principal components.
pca = PCA(n_components=2)  # adjust n_components as needed
principalComponents = pca.fit_transform(x)
# Create a DataFrame with the principal components
principalDf = pd.DataFrame(data=principalComponents, columns=['Principal Component 1', 'Principal Component 2'])
# Visualise the projected data.
plt.figure(figsize=(8, 6))
plt.scatter(principalDf['Principal Component 1'], principalDf['Principal Component 2'])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('2 component PCA')
plt.grid(True)
plt.show()
# Displaying explained variance ratio
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
Explained variance ratio: [0.37282531 0.25644431]
One-Sample t-Test¶
In [21]:
from scipy import stats
# One-sample t-test example
def one_sample_t_test(data, column_name, population_mean):
    """
    Test if the mean of the sample in `column_name` is equal to `population_mean`.
    NaNs are dropped before testing; results are printed, nothing is returned.
    """
    sample = data[column_name].dropna()
    t_stat, p_val = stats.ttest_1samp(sample, population_mean)
    print(f"One-sample t-test for {column_name}:")
    print(f"T-statistic: {t_stat}, P-value: {p_val}")
# Test each metric's sample mean against a population mean of zero.
features = ['Binding Affinity (kcal/mol)', 'Confidence Score', 'Strain Energy', 'Number of clashes']
for metric_name in features:
    one_sample_t_test(all_data, metric_name, 0)
One-sample t-test for Binding Affinity (kcal/mol): T-statistic: -495.84507190316606, P-value: 0.0 One-sample t-test for Confidence Score: T-statistic: -114.43554938475592, P-value: 0.0 One-sample t-test for Strain Energy: T-statistic: 432.11095456705647, P-value: 0.0 One-sample t-test for Number of clashes: T-statistic: 223.89591635586228, P-value: 0.0
ANOVA Test¶
In [22]:
def anova_test(data, column_name):
    """
    One-way ANOVA: test whether at least one recursion-step group mean of
    `column_name` differs from the others. NaNs are dropped per group.
    """
    samples = [
        values.dropna()
        for _, values in data.groupby('Recursion Step')[column_name]
    ]
    f_stat, p_val = stats.f_oneway(*samples)
    print(f"ANOVA for {column_name}:")
    print(f"F-statistic: {f_stat}, P-value: {p_val}")
# Run the ANOVA across recursion steps for every metric.
for metric_name in features:
    anova_test(all_data, metric_name)
ANOVA for Binding Affinity (kcal/mol): F-statistic: 1.6557797274380939, P-value: 0.1572631946602122 ANOVA for Confidence Score: F-statistic: 0.86888165238881, P-value: 0.4816376788466181 ANOVA for Strain Energy: F-statistic: 0.13782524944989935, P-value: 0.968315439830812 ANOVA for Number of clashes: F-statistic: 34.10450911333242, P-value: 2.2825366164770107e-28